from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
import pyreadr


def build_vdem_baseline(
    vdem_rdata: Path,
    out_path: Path,
    baseline_year: int,
    *,
    start_year: int = 1990,
) -> pd.DataFrame:
    """Build a one-row-per-country V-Dem baseline table and write it to CSV.

    For every ``country_text_id`` the row kept is the latest year within
    ``[start_year, baseline_year]`` that has a non-null ``v2x_freexp_altinf``.
    A country with no such observation keeps its latest row in the window
    instead, so the missing value remains visible in the output.

    Args:
        vdem_rdata: RData file holding an object named ``vdem``.
        out_path: Destination CSV; missing parent directories are created.
        baseline_year: Last year (inclusive) eligible for the baseline.
        start_year: First year (inclusive) eligible for the baseline.

    Returns:
        The frame that was written: ``country_text_id`` renamed to ``iso3``,
        ``v2x_freexp_altinf`` to ``media_freedom``, ``v2x_freexp`` to
        ``freedom_expression``, ``v2xme_altinf`` to ``alternative_info``, and
        the selected year stored as ``baseline_year``.

    Raises:
        KeyError: If the file has no ``vdem`` object, or if one of the columns
            the selection depends on is absent.
    """
    objects = pyreadr.read_r(str(vdem_rdata))
    if "vdem" not in objects:
        raise KeyError(f"no object named 'vdem' in {vdem_rdata}; found: {list(objects)}")
    vdem = objects["vdem"]

    wanted = [
        "country_name",
        "country_text_id",
        "year",
        "v2x_freexp_altinf",
        "v2x_freexp",
        "v2xme_altinf",
        "v2x_polyarchy",
        "v2x_libdem",
    ]
    # These three drive the selection itself; everything else is optional.
    required = ("country_text_id", "year", "v2x_freexp_altinf")
    missing = [col for col in required if col not in vdem.columns]
    if missing:
        raise KeyError(f"V-Dem data is missing required column(s): {missing}")
    present = [col for col in wanted if col in vdem.columns]
    v = vdem[present].copy()

    v["year"] = pd.to_numeric(v["year"], errors="coerce").astype("Int64")
    # Rows without a usable year or country code cannot be assigned to a baseline.
    v = v[v["year"].notna() & v["country_text_id"].notna()]
    v = v[(v["year"] >= start_year) & (v["year"] <= baseline_year)]

    # Within each country, order rows so that those carrying a value sort after
    # those without, and later years sort after earlier ones. The last row per
    # country is then the latest observed value, or the latest row overall when
    # the country has no observed value in the window.
    v = v.assign(_has_value=v["v2x_freexp_altinf"].notna())
    v = v.sort_values(["country_text_id", "_has_value", "year"])
    baseline = (
        v.drop_duplicates(subset="country_text_id", keep="last")
        .drop(columns="_has_value")
        .reset_index(drop=True)
    )

    # Key column first, then the remaining columns in `wanted` order.
    ordered = ["country_text_id"] + [col for col in present if col != "country_text_id"]
    baseline = baseline[ordered].rename(
        columns={
            "country_text_id": "iso3",
            "v2x_freexp_altinf": "media_freedom",
            "v2x_freexp": "freedom_expression",
            "v2xme_altinf": "alternative_info",
        }
    )
    baseline["baseline_year"] = baseline["year"].astype("Int64")
    baseline = baseline.drop(columns=["year"])

    # Infinite values are written as missing rather than as "inf" in the CSV.
    baseline = baseline.replace([np.inf, -np.inf], np.nan)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    baseline.to_csv(out_path, index=False, encoding="utf-8")
    return baseline


def main() -> None:
    """Command-line entry point: parse options, build the baseline, report the result."""
    # (flag, converter, default) for each supported option.
    option_specs = (
        ("--vdem", Path, Path("data/vdem/vdem.RData")),
        ("--out", Path, Path("outputs/data/vdem_baseline.csv")),
        ("--baseline-year", int, 2019),
    )
    cli = argparse.ArgumentParser()
    for flag, converter, fallback in option_specs:
        cli.add_argument(flag, type=converter, default=fallback)
    opts = cli.parse_args()

    table = build_vdem_baseline(opts.vdem, opts.out, opts.baseline_year)
    row_count = len(table)
    col_count = len(table.columns)
    print(f"Wrote {opts.out} rows={row_count:,} cols={col_count:,}")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

